Predict coverage

TopHat mode: GaT

library('ballgown')
library('GenomicRanges')
library('usefulstuff')
library('forecast')
library('ggplot2')
library('plyr')

Compute

Loads the coverage and splits it by gene for each sample. Then it calculates the correlation (and \(R^2\)) between the Geuvadis sample coverage and each of the simulated scenarios including simply drawing numbers from the negative binomial (aka, skipping alignment). For each Geuvadis gene-sample, an ARIMA model is fitted, then one more for each simulated scenario using the simulated coverage as a predictor. The coefficient for the predictor is extracted as well as its corresponding p-value. Similar numbers are calculated between replicates of the same simulation scenario.

## Load coverage data
load(file.path(tophat, 'ex_regCov.Rdata'))

## Define exons
gff <- gffReadGR(file.path('..', 'select_genes', 'twenty_genes.gtf'))
genes <- split(gff, strip_quotes(getAttributeField(as.character(gff$group),     
    'gene_id')))
exons <- unlist(reduce(genes))

## Split by gene
geneCov <- split(ex_regCov, names(exons))
geneCov <- mapply(function(x, y) {
    res <- do.call(rbind, x)
    ## Save the bases
    rownames(res) <- unlist(mapply(seq, start(exons[names(exons) == y]), end(exons[names(exons) == y])))
    return(res)
}, geneCov, names(geneCov), SIMPLIFY = FALSE)


## Split by sample
geu_samples <- c('NA06985', 'NA12144', 'NA12776', 'NA18858', 'NA20542', 
    'NA20772', 'NA20815')


set.seed(20150221)
sampleCov <- lapply(geneCov, function(x) {
    result <- lapply(seq_len(length(geu_samples)), function(i) {
        idx <- grepl(paste0(geu_samples[i], '|s', i), colnames(x))
        res <- x[, idx]
        colnames(res) <- c('Geuvadis', 'd1', 'd2', 'r1', 'r2')
        m <- mean(res$Geuvadis)
        res$b1a <- rnbinom(nrow(res), size = 1, mu = m)
        res$b1b <- rnbinom(nrow(res), size = 1, mu = m)
        res$b6a <- rnbinom(nrow(res), size = 6, mu = m)
        res$b6b <- rnbinom(nrow(res), size = 6, mu = m)
        return(res)
    })
    names(result) <- geu_samples
    return(result)
})


fitArima <- function(x, y = 'Geuvadis', order, d, auto = FALSE) {
    if(auto) {
        f <- auto.arima(d[, y], xreg = d[, x])
        order <- arimaorder(f)
    } else {
        f <- tryCatch(arima(d[, y], xreg = d[, x], order = order), error = function(e) { return(list(error = TRUE))})
    }
    
    if(!is.null(f$error)) return(list(order = NA, coef = NA, z = NA))
    
    z <- coef(f)["d[, x]"] / sqrt(f$var.coef["d[, x]", "d[, x]"])
    return(list(order = paste(order, collapse = '-'), coef = coef(f)["d[, x]"], z = z))
}

rmsd <- function(x, y = 'Geuvadis', d) {
    sqrt(mean( (d[, y] / max(d[, y]) - d[, x] / max(d[, x]))^2 ))
}

corr <- function(x, y = 'Geuvadis', d) { 
    cor(d[, y], d[, x])
}


sets <- c('d1', 'd2', 'r1', 'r2', 'b1a', 'b1b', 'b6a', 'b6b')
reps_a <- sets[rep(c(TRUE, FALSE), 4)]
reps_b <- sets[rep(c(FALSE, TRUE), 4)]

## Save as it takes about 20-30 mins to compute
if(file.exists(file.path(tophat, 'pred_cov.Rdata'))) {
    load(file.path(tophat, 'pred_cov.Rdata'))
} else {
    pred_cov <- lapply(names(sampleCov), function(gene) {
        res <- lapply(names(sampleCov[[gene]]), function(samp) {
            d <- sampleCov[[gene]][[samp]]
        
            ## Arima
            ord <- arimaorder(auto.arima(d$Geuvadis))
            info <- c(lapply(sets, fitArima, order = ord, d = d), mapply(fitArima, x = reps_b, y = reps_a, MoreArgs = list(d = d, order = ord), SIMPLIFY = FALSE))
            orders <- unlist(sapply(info, '[', 'order'))
            coefs <- unlist(sapply(info, '[', 'coef'))
            pvals <- pnorm(abs(unlist(sapply(info, '[', 'z'))), lower = FALSE) * 2
        
            ## RMSD
            RMSD <- c(sapply(sets, rmsd, d = d), mapply(rmsd, reps_b, reps_a, MoreArgs = list(d = d)))
        
            ## Cor and R2
            CORR <- c(sapply(sets, corr, d = d), mapply(corr, reps_b, reps_a, MoreArgs = list(d = d)))
        
        
            data.frame(set = c(sets, paste(reps_a, reps_b, sep = '-')), order = orders, coef = coefs, pval = pvals, rmsd = RMSD, cor = CORR, R2 = CORR^2, type = c(rep(c('d', 'r', 'b1', 'b6'), each = 2), 'd', 'r', 'b1', 'b6'), group = rep(c('Geuvadis', 'replicate'), c(8, 4)), gene = gene, sample = samp, row.names = NULL, stringsAsFactors = FALSE)
        })
        do.call(rbind, res)
    })
    pred_cov <- do.call(rbind, pred_cov)
    pred_cov$set <- factor(pred_cov$set, levels = c(sets, paste(reps_a, reps_b, sep = '-')))

    ## Significant pval?
    pred_cov$sig <- pred_cov$pval <= 0.05

    ## Extract ARIMA order info
    pred_cov$ar <- as.integer(substr(pred_cov$order, 1, 1))
    pred_cov$i <- as.integer(substr(pred_cov$order, 3, 3))
    pred_cov$ma <- as.integer(substr(pred_cov$order, 5, 5))
    save(pred_cov, file = file.path(tophat, 'pred_cov.Rdata'))
}

Models selected

Explore which ARIMA models were selected for each gene-sample scenario.

## Types of models selected
sort(table(pred_cov$order), decreasing = TRUE)
## 
## 1-1-1 2-1-2 0-1-2 0-2-1 2-2-1 3-1-2 0-1-1 0-1-4 1-1-2 1-2-1 1-1-3 2-1-0 
##   264   156    96    72    72    72    60    60    60    60    48    48 
## 3-1-3 4-1-3 5-1-4 5-1-5 5-2-0 0-1-0 0-2-2 0-1-5 1-1-4 2-1-3 2-1-4 2-1-5 
##    48    48    48    48    48    36    36    24    24    24    24    24 
## 2-2-3 5-2-5 1-1-5 2-2-2 2-2-4 3-1-1 3-1-4 3-2-1 3-2-2 4-1-2 4-1-4 4-2-1 
##    24    24    12    12    12    12    12    12    12    12    12    12 
## 5-1-3 
##    12
table(pred_cov$ar) /12
## 
##  0  1  2  3  4  5 
## 32 39 33 14  7 15
table(pred_cov$i) /12
## 
##   1   2 
## 107  33
table(pred_cov$ma) /12
## 
##  0  1  2  3  4  5 
## 11 47 38 17 16 11
ggplot(subset(pred_cov, set == 'd1'), aes(x = ar)) + geom_bar(stat = 'bin') + facet_grid(i ~ ma ) + theme_bw()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

Significant coef

Percent of significant ARIMA coefficients for each of the simulated scenarios. Either overall percent, or first summarized by genes or by samples.

sig <- ddply(pred_cov, .(set, type), summarize, mean = mean(sig))
ggplot(sig, aes(x = set, y = mean * 100, fill = type)) + geom_boxplot() + ylim(c(0, 100)) + ylab('Percent') + ggtitle('Significant coefs') + theme_bw()

sig <- ddply(pred_cov, .(set, sample, type), summarize, mean = mean(sig))
ggplot(sig, aes(x = set, y = mean * 100, fill = type)) + geom_boxplot(outlier.shape = 5) + ylim(c(0, 100)) + ylab('Percent: 7 samples') + ggtitle('Significant coefs: by genes') + geom_jitter(position = position_jitter(width = 0.2, height = 0), alpha = 1/2) + theme_bw()

sig <- ddply(pred_cov, .(set, gene, type), summarize, mean = mean(sig))
ggplot(sig, aes(x = set, y = mean * 100, fill = type)) + geom_boxplot(outlier.shape = 5) + ylim(c(0, 100)) + ylab('Percent: 20 genes') + ggtitle('Significant coefs: by samples') + geom_jitter(position = position_jitter(width = 0.2, height = 0), alpha = 1/2) + theme_bw()

Pval

Boxplots of ARIMA coefficient p-values. First overall, then by sample (20 points per boxplot: 1 per gene), then by gene (7 points per boxplot).

ggplot(pred_cov, aes(x = set, y = pval, fill = type)) + geom_boxplot() + theme_bw() + theme(axis.text.x  = element_text(angle=90, vjust=0.5, size=8))

ggplot(pred_cov, aes(x = set, y = pval, fill = type)) + geom_boxplot() + facet_grid(. ~ sample) + theme_bw() + theme(axis.text.x  = element_text(angle=90, vjust=0.5, size=8))

ggplot(pred_cov, aes(x = set, y = pval, fill = type)) + geom_boxplot() + facet_wrap( ~ gene, ncol = 10) + theme_bw() + theme(axis.text.x  = element_text(angle=90, vjust=0.5, size=8))

Coef

Boxplots of ARIMA coefficients. First overall, then by sample (20 points per boxplot: 1 per gene), then by gene (7 points per boxplot).

ggplot(pred_cov, aes(x = set, y = coef, fill = type)) + geom_boxplot() + theme_bw() + theme(axis.text.x  = element_text(angle=90, vjust=0.5, size=8))

ggplot(pred_cov, aes(x = set, y = coef, fill = type)) + geom_boxplot() + facet_grid(. ~ sample) + theme_bw() + theme(axis.text.x  = element_text(angle=90, vjust=0.5, size=8))

ggplot(pred_cov, aes(x = set, y = coef, fill = type)) + geom_boxplot() + facet_wrap( ~ gene, ncol = 10) + theme_bw() + theme(axis.text.x  = element_text(angle=90, vjust=0.5, size=8))

Pval vs Coef

Relationship between p-values and coefficients for the ARIMA models.

ggplot(pred_cov, aes(x = pval, y = coef, color = type)) + geom_point() + facet_grid(group ~ .) + geom_vline(xintercept = 0.05, colour = 'red') + theme_bw()

ggplot(pred_cov, aes(x = pval, y = coef, color = type)) + geom_point() + facet_grid(group ~ sample) + geom_vline(xintercept = 0.05, colour = 'red') + theme_bw() + theme(axis.text.x  = element_text(angle=90, vjust=0.5, size=8))

ggplot(pred_cov, aes(x = pval, y = coef, color = type)) + geom_point() + facet_grid(group ~ gene) + geom_vline(xintercept = 0.05, colour = 'red') + theme_bw() + theme(axis.text.x  = element_text(angle=90, vjust=0.5, size=8))

RMSD-max

Boxplots of root mean square distance where each vector has been standardized by its maximum. First overall, then by sample (20 points per boxplot: 1 per gene), then by gene (7 points per boxplot).

ggplot(pred_cov, aes(x = set, y = rmsd, fill = type)) + geom_boxplot() + theme_bw() + theme(axis.text.x  = element_text(angle=90, vjust=0.5, size=8))

ggplot(pred_cov, aes(x = set, y = rmsd, fill = type)) + geom_boxplot() + facet_grid(. ~ sample) + theme_bw() + theme(axis.text.x  = element_text(angle=90, vjust=0.5, size=8))

ggplot(pred_cov, aes(x = set, y = rmsd, fill = type)) + geom_boxplot() + facet_wrap( ~ gene, ncol = 10) + theme_bw() + theme(axis.text.x  = element_text(angle=90, vjust=0.5, size=8))

Correlation

Boxplots of correlation. First overall, then by sample (20 points per boxplot: 1 per gene), then by gene (7 points per boxplot).

ggplot(pred_cov, aes(x = set, y = cor, fill = type)) + geom_boxplot() + theme_bw() + theme(axis.text.x  = element_text(angle=90, vjust=0.5, size=8))

ggplot(pred_cov, aes(x = set, y = cor, fill = type)) + geom_boxplot() + facet_grid(. ~ sample) + theme_bw() + theme(axis.text.x  = element_text(angle=90, vjust=0.5, size=8))

ggplot(pred_cov, aes(x = set, y = cor, fill = type)) + geom_boxplot() + facet_wrap( ~ gene, ncol = 10) + theme_bw() + theme(axis.text.x  = element_text(angle=90, vjust=0.5, size=8))

Cor vs RMSD-max

Comparison between correlation and RMSD-max.

ggplot(pred_cov, aes(x = rmsd, y = cor, color = type)) + geom_point() + facet_grid(group ~ .) + theme_bw()

ggplot(pred_cov, aes(x = rmsd, y = cor, color = type)) + geom_point() + facet_grid(group ~ sample) + theme_bw() + theme(axis.text.x  = element_text(angle=90, vjust=0.5, size=8))

ggplot(pred_cov, aes(x = rmsd, y = cor, color = type)) + geom_point() + facet_grid(group ~ gene) + theme_bw() + theme(axis.text.x  = element_text(angle=90, vjust=0.5, size=8))

\(R^2\)

Boxplots of \(R^2\) (correlation squared since its a simple linear model). First overall, then by sample (20 points per boxplot: 1 per gene), then by gene (7 points per boxplot).

ggplot(pred_cov, aes(x = set, y = R2, fill = type)) + geom_boxplot() + theme_bw() + theme(axis.text.x  = element_text(angle=90, vjust=0.5, size=8))

ggplot(pred_cov, aes(x = set, y = R2, fill = type)) + geom_boxplot() + facet_grid(. ~ sample) + theme_bw() + theme(axis.text.x  = element_text(angle=90, vjust=0.5, size=8)) 

ggplot(pred_cov, aes(x = set, y = R2, fill = type)) + geom_boxplot() + facet_wrap( ~ gene, ncol = 10) + theme_bw() + theme(axis.text.x  = element_text(angle=90, vjust=0.5, size=8))

Reproducibility

## Reproducibility info
# Time spent in this report
diff(c(startTime, Sys.time()))
## Time difference of 1.17969 hours
Sys.time()
## [1] "2015-02-23 05:06:40 EST"
options(width = 120)
devtools::session_info()
## Session info -----------------------------------------------------------------------------------------------------------
##  setting  value                                      
##  version  R version 3.1.1 Patched (2014-10-16 r66782)
##  system   x86_64, linux-gnu                          
##  ui       X11                                        
##  language (EN)                                       
##  collate  en_US.UTF-8                                
##  tz       <NA>
## Packages ---------------------------------------------------------------------------------------------------------------
##  package           * version  date       source                                   
##  annotate          * 1.44.0   2014-10-15 Bioconductor                             
##  AnnotationDbi     * 1.28.1   2014-10-29 Bioconductor                             
##  ballgown            1.99.3   2015-02-20 Github (alyssafrazee/ballgown@7d8fb1c)   
##  base64enc         * 0.1-2    2014-06-26 CRAN (R 3.1.0)                           
##  BatchJobs         * 1.5      2014-10-30 CRAN (R 3.1.1)                           
##  BBmisc            * 1.9      2015-02-03 CRAN (R 3.1.1)                           
##  Biobase           * 2.26.0   2014-10-15 Bioconductor                             
##  BiocGenerics        0.12.1   2014-11-15 Bioconductor                             
##  BiocParallel      * 1.0.3    2015-02-09 Bioconductor                             
##  Biostrings        * 2.34.1   2014-12-13 Bioconductor                             
##  bitops            * 1.0-6    2013-08-17 CRAN (R 3.1.0)                           
##  brew              * 1.0-6    2011-04-13 CRAN (R 3.1.0)                           
##  Cairo             * 1.5-6    2014-06-26 CRAN (R 3.1.0)                           
##  checkmate         * 1.5.1    2014-12-14 CRAN (R 3.1.1)                           
##  codetools         * 0.2-9    2014-08-21 CRAN (R 3.1.1)                           
##  colorspace        * 1.2-4    2013-09-30 CRAN (R 3.1.0)                           
##  DBI               * 0.3.1    2014-09-24 CRAN (R 3.1.1)                           
##  devtools          * 1.7.0    2015-01-17 CRAN (R 3.1.1)                           
##  digest            * 0.6.8    2014-12-31 CRAN (R 3.1.1)                           
##  evaluate          * 0.5.5    2014-04-29 CRAN (R 3.1.0)                           
##  fail              * 1.2      2013-09-19 CRAN (R 3.1.0)                           
##  foreach           * 1.4.2    2014-04-11 CRAN (R 3.1.0)                           
##  forecast            5.8      2015-01-06 CRAN (R 3.1.1)                           
##  formatR           * 1.0      2014-08-25 CRAN (R 3.1.1)                           
##  fracdiff          * 1.4-2    2012-12-02 CRAN (R 3.1.1)                           
##  genefilter        * 1.48.1   2014-10-17 Bioconductor                             
##  GenomeInfoDb        1.2.4    2014-12-20 Bioconductor                             
##  GenomicAlignments * 1.2.1    2014-11-05 Bioconductor                             
##  GenomicRanges       1.18.4   2015-01-08 Bioconductor                             
##  ggplot2             1.0.0    2014-05-21 CRAN (R 3.1.0)                           
##  gtable            * 0.1.2    2012-12-05 CRAN (R 3.1.0)                           
##  htmltools         * 0.2.6    2014-09-08 CRAN (R 3.1.1)                           
##  IRanges             2.0.1    2014-12-13 Bioconductor                             
##  iterators         * 1.0.7    2014-04-11 CRAN (R 3.1.0)                           
##  knitr             * 1.9      2015-01-20 CRAN (R 3.1.1)                           
##  labeling          * 0.3      2014-08-23 CRAN (R 3.1.1)                           
##  lattice           * 0.20-29  2014-04-04 CRAN (R 3.1.1)                           
##  limma             * 3.22.4   2015-01-16 Bioconductor                             
##  MASS              * 7.3-35   2014-09-30 CRAN (R 3.1.1)                           
##  Matrix            * 1.1-4    2014-06-15 CRAN (R 3.1.1)                           
##  mgcv              * 1.8-3    2014-08-29 CRAN (R 3.1.1)                           
##  munsell           * 0.4.2    2013-07-11 CRAN (R 3.1.0)                           
##  nlme              * 3.1-118  2014-10-07 CRAN (R 3.1.1)                           
##  nnet              * 7.3-8    2014-03-28 CRAN (R 3.1.1)                           
##  plyr                1.8.1    2014-02-26 CRAN (R 3.1.0)                           
##  proto             * 0.3-10   2012-12-22 CRAN (R 3.1.0)                           
##  quadprog          * 1.5-5    2013-04-17 CRAN (R 3.1.0)                           
##  RColorBrewer      * 1.1-2    2014-12-07 CRAN (R 3.1.1)                           
##  Rcpp              * 0.11.4   2015-01-24 CRAN (R 3.1.1)                           
##  RCurl             * 1.95-4.5 2014-12-06 CRAN (R 3.1.1)                           
##  reshape2          * 1.4.1    2014-12-06 CRAN (R 3.1.1)                           
##  rmarkdown           0.5.1    2015-01-26 CRAN (R 3.1.1)                           
##  Rsamtools         * 1.18.2   2014-11-12 Bioconductor                             
##  RSQLite           * 1.0.0    2014-10-25 CRAN (R 3.1.1)                           
##  rstudioapi        * 0.2      2014-12-31 CRAN (R 3.1.1)                           
##  rtracklayer       * 1.26.2   2014-11-12 Bioconductor                             
##  S4Vectors           0.4.0    2014-10-15 Bioconductor                             
##  scales            * 0.2.4    2014-04-22 CRAN (R 3.1.0)                           
##  sendmailR         * 1.2-1    2014-09-21 CRAN (R 3.1.1)                           
##  stringr           * 0.6.2    2012-12-06 CRAN (R 3.1.0)                           
##  survival          * 2.37-7   2014-01-22 CRAN (R 3.1.1)                           
##  sva               * 3.12.0   2014-10-15 Bioconductor                             
##  timeDate            3012.100 2015-01-23 CRAN (R 3.1.1)                           
##  tseries           * 0.10-33  2015-02-10 CRAN (R 3.1.1)                           
##  usefulstuff         1.01     2015-02-19 Github (alyssafrazee/usefulstuff@b260fe9)
##  XML               * 3.98-1.1 2013-06-20 CRAN (R 3.1.0)                           
##  xtable            * 1.7-4    2014-09-12 CRAN (R 3.1.1)                           
##  XVector           * 0.6.0    2014-10-15 Bioconductor                             
##  yaml              * 2.1.13   2014-06-12 CRAN (R 3.1.1)                           
##  zlibbioc          * 1.12.0   2014-10-15 Bioconductor                             
##  zoo                 1.7-11   2014-02-27 CRAN (R 3.1.0)